import pandas as pd
import numpy as np
import shap
import copy
import glob
import re
import pickle
import datetime as dt
from datetime import timezone
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans
import plotly.express as px
from sklearn.metrics import roc_curve, precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.metrics import get_scorer_names, classification_report, make_scorer,accuracy_score, recall_score
from sklearn.model_selection import cross_val_score, cross_validate ,RepeatedStratifiedKFold, train_test_split
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
def kfold_report(model, X, Y):
    """Print cross-validated metrics for `model` and a pooled classification report.

    Two evaluations are run:
      1. Plain 10-fold cross-validation, reporting the mean of several metrics.
      2. RepeatedStratifiedKFold (5 splits x 2 repeats) with a custom scorer
         that accumulates every fold's true/predicted labels so one overall
         classification report can be printed at the end.

    Parameters
    ----------
    model : estimator implementing the scikit-learn fit/predict API.
    X, Y : feature matrix and binary target used for cross-validation.

    Returns
    -------
    (originalclass, predictedclass) : lists of pooled true and predicted labels
        gathered across all repeated K-fold iterations.
    """
    # Mean score per metric from a plain 10-fold CV.
    score_est = {}
    for score in ["roc_auc", "f1", "precision", "recall", "accuracy"]:
        cvs = cross_val_score(model, X, Y, scoring=score, cv=10).mean().round(3)
        score_est[score] = cvs  # store metric name -> mean score in a dictionary
    print('\n Average Score from cross validation with cv=10 \n', model, ':', score_est, '\n')

    originalclass = []   # true labels accumulated across folds
    predictedclass = []  # predicted labels accumulated across folds

    # cross_val_score calls this once per fold; it records the fold's labels
    # as a side effect and returns recall so fold scores stay meaningful.
    def custom_scoring_function(y_true, y_pred):
        originalclass.extend(y_true)
        predictedclass.extend(y_pred)
        return recall_score(y_true, y_pred)  # could also use f1_score with an average parameter

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
    nested_score = cross_val_score(model, X=X, y=Y, cv=cv,
                                   scoring=make_scorer(custom_scoring_function))
    print("Mean Recall from Kfold : %.3f" % np.mean(nested_score))
    print('Over all classification report')
    print(classification_report(originalclass, predictedclass))
    return originalclass, predictedclass
def plot_roc_curve(y_test, y_pred):
    """Plot the ROC curve (TPR vs FPR) with the random-chance diagonal.

    Parameters
    ----------
    y_test : true binary labels.
    y_pred : predicted scores/probabilities for the positive class.
    """
    # compute true positive rate and false positive rate at every threshold
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    # plotting them against each other
    plt.figure(figsize=(8, 3))
    plt.plot(false_positive_rate, true_positive_rate, linewidth=2, label="plot_roc_curve")
    plt.plot([0, 1], [0, 1], 'r', linewidth=4)  # chance line for reference
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate (FPR)', fontsize=16)
    plt.ylabel('True Positive Rate (TPR)', fontsize=16)
    # the curve's label was set but never displayed without a legend call
    plt.legend(loc="lower right", fontsize=12)
    plt.show()
def plot_precision_recall_curve(y_test, y_pred):
    """Plot precision and recall as functions of the decision threshold.

    Parameters
    ----------
    y_test : true binary labels.
    y_pred : predicted probabilities/scores for the positive class.
    """
    precision, recall, threshold = precision_recall_curve(y_test, y_pred)
    plt.figure(figsize=(8, 3))
    # precision/recall arrays have one more entry than threshold, so the
    # final point is dropped to keep the x/y lengths aligned.
    plt.plot(threshold, precision[:-1], "r-", label="precision", linewidth=5)
    plt.plot(threshold, recall[:-1], "b", label="recall", linewidth=5)
    plt.xlabel("threshold", fontsize=19)
    plt.legend(loc="upper right", fontsize=19)
    plt.ylim([0, 1])
    plt.show()
# Load the processed developer dataset; the first CSV column is the index.
data = pd.read_csv('data/1.1-mu-process-data-05-09-22.csv', index_col=0)
print(data.shape)
# Count developers per label and express each label's share as a percentage
# (class-balance check before modelling).
df =data.groupby('label').agg({'dev_id':'count'}).reset_index()
df['dev_per'] = round(100*df['dev_id']/sum(df['dev_id']),2)
df = df.sort_values(by='dev_id', ascending=False)
# Bar chart of the label distribution.
fig = px.bar(df, x='label', y='dev_id',
hover_data=['dev_id', 'dev_per'], text='dev_per',
labels={'label':'Is HQ Developer?','dev_id':'Number of Developers'}, height=400)
fig.update_xaxes(type='category')
fig.show()
data.head(5)
(49477, 21)
| dev_id | source_attribution_type | user_os_type | quiz_answer | resume_flag | num_chars_resume | years_of_experience | years_of_remote_experience | english_communication | role_type | sn_avg_score | num_self_skills | num_self_beginner_skills | num_self_intermediate_skills | num_self_advanced_skills | num_self_expert_skills | time_to_upload_resume | time_to_sn_test | label | is_github | is_quiz_answer_correct | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2565330 | Linux | MAX_NUM += 1 | True | 3557.0 | 10.0 | 5.0 | Great | Fullstack (BE-heavy) | 4.707500 | 20.0 | 2.0 | 8.0 | 4.0 | 6.0 | 0.0 | 0.0 | 0 | 0 | 0 | |
| 1 | 630845 | Android OS | MAX_NUM = NUM | True | 1720.0 | 5.0 | 5.0 | Great | Fullstack (BE-heavy) | 3.805000 | 19.0 | 0.0 | 6.0 | 5.0 | 8.0 | 1.0 | 1.0 | 0 | 1 | 1 | |
| 2 | 817987 | Android OS | no-answer | True | 1606.0 | 1.0 | 0.0 | Great | Other | 2.577500 | 10.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 46.0 | 0 | 1 | 0 | |
| 3 | 1402917 | Outbound | Windows | MAX_NUM = NUM | True | 2973.0 | 4.0 | 2.0 | Average | Web Frontend | 3.734166 | 14.0 | 2.0 | 4.0 | 6.0 | 2.0 | 0.0 | 0.0 | 0 | 0 | 1 |
| 5 | 2347395 | Undefined | Android OS | MAX_NUM += 1 | True | 2768.0 | 5.0 | 3.0 | Great | Machine Learning | 3.894166 | 22.0 | 0.0 | 7.0 | 4.0 | 11.0 | 0.0 | 10.0 | 0 | 0 | 0 |
# Self-reported skills in long format -> wide table: one row per developer,
# one column per skill, values = integer skill level; missing skills -> 0.
data_self_skill = pd.read_csv('data/self-skill-pivot.csv')
data_self_skill_piv = data_self_skill.pivot_table(index='developer_id', columns='skill_name', values='skill_level_int').reset_index()
data_self_skill_piv.index.name = None
data_self_skill_piv.columns.name = None
data_self_skill_piv.fillna(0, inplace=True)
print(data_self_skill_piv.shape)
data_self_skill_piv.rename(columns={'developer_id':'dev_id'}, inplace=True)
# Attach the label; drop developers that have no label in the main dataset
# (left-merge leaves those as NaN, which dropna removes).
data_self_skill_piv = data_self_skill_piv.merge(data[['dev_id', 'label']], how='left', on='dev_id')
data_self_skill_piv.dropna(inplace=True)
# Feature columns = every skill column (exclude the id and the label).
col_fe = list(data_self_skill_piv.columns)
col_fe.remove('dev_id')
col_fe.remove('label')
(55238, 847)
# Univariate feature selection: keep the 10 skill columns with the highest
# chi-squared association with the label.
from sklearn.feature_selection import SelectKBest, chi2
X_fe = data_self_skill_piv[col_fe]
Y_fe = data_self_skill_piv['label']
fs = SelectKBest(score_func=chi2, k=10)
fs.fit(X_fe, Y_fe)
#X_fe_final = fs.transform(X_fe) # retun numpy array # fs.get_feature_names_out()
# Select the chosen columns from the DataFrame so column names are kept.
X_fe_final = X_fe.iloc[:,fs.get_support(indices=True)].copy()
top_skill_algo = X_fe_final.columns
# Union of the algorithm-selected skills and a hand-picked list of common ones.
top_skill_self = ['dev_id', 'JavaScript', 'Python', 'SQL', 'React', 'Git','Node.js']
top_skill_self.extend(top_skill_algo)
top_skill = list(set(top_skill_self))  # de-duplicate
#top_skill.extend(['dev_id'])
top_skill_data = data_self_skill_piv[top_skill]
# Prefix skill columns with 'self_skill_' so they remain identifiable after
# merging into the main dataset ('dev_id' is left untouched as the join key).
top_skill = ['self_skill_'+ val if val!='dev_id' else val for val in top_skill_data.columns]
top_skill_data.columns = top_skill
final_data = data.merge(top_skill_data, how='left', on='dev_id')
final_data.to_csv('data/aggregated_data.csv')
final_data.head()
| dev_id | source_attribution_type | user_os_type | quiz_answer | resume_flag | num_chars_resume | years_of_experience | years_of_remote_experience | english_communication | role_type | sn_avg_score | num_self_skills | num_self_beginner_skills | num_self_intermediate_skills | num_self_advanced_skills | num_self_expert_skills | time_to_upload_resume | time_to_sn_test | label | is_github | is_quiz_answer_correct | self_skill_MySQL | self_skill_Express.js | self_skill_Laravel | self_skill_PHP | self_skill_REST/RESTful APIs | self_skill_SQL | self_skill_Node.js | self_skill_Vue.js | self_skill_PHP/MySQL | self_skill_JavaScript | self_skill_React | self_skill_PHP, Laravel | self_skill_Git | self_skill_Python | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2565330 | Linux | MAX_NUM += 1 | True | 3557.0 | 10.0 | 5.0 | Great | Fullstack (BE-heavy) | 4.707500 | 20.0 | 2.0 | 8.0 | 4.0 | 6.0 | 0.0 | 0.0 | 0 | 0 | 0 | 4.0 | 0.0 | 4.0 | 4.0 | 1.0 | 0.0 | 0.0 | 3.0 | 0.0 | 3.0 | 0.0 | 0.0 | 4.0 | 0.0 | |
| 1 | 630845 | Android OS | MAX_NUM = NUM | True | 1720.0 | 5.0 | 5.0 | Great | Fullstack (BE-heavy) | 3.805000 | 19.0 | 0.0 | 6.0 | 5.0 | 8.0 | 1.0 | 1.0 | 0 | 1 | 1 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 4.0 | 0.0 | |
| 2 | 817987 | Android OS | no-answer | True | 1606.0 | 1.0 | 0.0 | Great | Other | 2.577500 | 10.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 46.0 | 0 | 1 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | |
| 3 | 1402917 | Outbound | Windows | MAX_NUM = NUM | True | 2973.0 | 4.0 | 2.0 | Average | Web Frontend | 3.734166 | 14.0 | 2.0 | 4.0 | 6.0 | 2.0 | 0.0 | 0.0 | 0 | 0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 2.0 | 0.0 | 0.0 | 0.0 |
| 4 | 2347395 | Undefined | Android OS | MAX_NUM += 1 | True | 2768.0 | 5.0 | 3.0 | Great | Machine Learning | 3.894166 | 22.0 | 0.0 | 7.0 | 4.0 | 11.0 | 0.0 | 10.0 | 0 | 0 | 0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 |
# num_chars_resume has large magnitudes, so normalize it by dividing by 1000.
# Features are assembled in X and labels in Y below.
# (The original first line had markdown prose fused onto the code statement.)
final_data['num_chars_resume_thousands'] = final_data['num_chars_resume']/1000
final_data['resume_flag'] = final_data['resume_flag'].astype(int)

# Numeric/boolean predictors, plus the selected self-skill columns.
predictors = ['resume_flag', 'num_chars_resume_thousands', 'years_of_experience','years_of_remote_experience',
              'sn_avg_score','num_self_skills', 'num_self_beginner_skills', 'num_self_intermediate_skills',
              'num_self_advanced_skills', 'num_self_expert_skills', 'is_github', 'time_to_upload_resume',
              'time_to_sn_test','is_quiz_answer_correct']
top_skill.remove('dev_id')  # the id is a join key, not a predictor
predictors.extend(top_skill)

# One-hot encode the categorical columns.
dev_role = pd.get_dummies(final_data['role_type'], prefix='role')
dev_os_type = pd.get_dummies(final_data['user_os_type'], prefix='os_type')
dev_source_attribution = pd.get_dummies(final_data['source_attribution_type'], prefix='source_attribution')
dev_english = pd.get_dummies(final_data['english_communication'], prefix='english')

# Assemble the model matrix in one concat (the original selected
# final_data[predictors] twice and concatenated in four separate steps).
ml_data = pd.concat([final_data[predictors], dev_role, dev_os_type,
                     dev_source_attribution, dev_english], axis=1)
# Strip characters that tree libraries (e.g. LightGBM) reject in column names.
ml_data = ml_data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
final_predictiors = ml_data.columns
X = ml_data
Y = final_data['label']
dev_id = final_data['dev_id']
X.head()
| resume_flag | num_chars_resume_thousands | years_of_experience | years_of_remote_experience | sn_avg_score | num_self_skills | num_self_beginner_skills | num_self_intermediate_skills | num_self_advanced_skills | num_self_expert_skills | is_github | time_to_upload_resume | time_to_sn_test | is_quiz_answer_correct | self_skill_MySQL | self_skill_Expressjs | self_skill_Laravel | self_skill_PHP | self_skill_RESTRESTfulAPIs | self_skill_SQL | self_skill_Nodejs | self_skill_Vuejs | self_skill_PHPMySQL | self_skill_JavaScript | self_skill_React | self_skill_PHPLaravel | self_skill_Git | self_skill_Python | role_BackendSystems | role_CloudAIML | role_CloudData | role_CloudNetworking | role_CloudSecurity | role_CloudSolutionsArchitecture | role_DevOps | role_FullStack | role_FullstackBEheavy | role_FullstackFEheavy | role_MachineLearning | role_Mobile | role_Other | role_Research | role_WebBackend | role_WebFrontend | os_type_AndroidOS | os_type_Linux | os_type_MacOS | os_type_Undefined | os_type_Windows | os_type_iOS | source_attribution_Customer | source_attribution_Facebook | source_attribution_Github | source_attribution_Google | source_attribution_Organic | source_attribution_Outbound | source_attribution_Quora | source_attribution_Remarketing | source_attribution_Undefined | source_attribution_Youtube | source_attribution_ZFD | english_Average | english_Basic | english_Great | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3.557 | 10.0 | 5.0 | 4.707500 | 20.0 | 2.0 | 8.0 | 4.0 | 6.0 | 0 | 0.0 | 0.0 | 0 | 4.0 | 0.0 | 4.0 | 4.0 | 1.0 | 0.0 | 0.0 | 3.0 | 0.0 | 3.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 1 | 1.720 | 5.0 | 5.0 | 3.805000 | 19.0 | 0.0 | 6.0 | 5.0 | 8.0 | 1 | 1.0 | 1.0 | 1 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 1 | 1.606 | 1.0 | 0.0 | 2.577500 | 10.0 | 10.0 | 0.0 | 0.0 | 0.0 | 1 | 0.0 | 46.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 1 | 2.973 | 4.0 | 2.0 | 3.734166 | 14.0 | 2.0 | 4.0 | 6.0 | 2.0 | 0 | 0.0 | 0.0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4 | 1 | 2.768 | 5.0 | 3.0 | 3.894166 | 22.0 | 0.0 | 7.0 | 4.0 | 11.0 | 0 | 0.0 | 10.0 | 0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
from scipy.stats import chi2_contingency

# Categorical features (plus the selected skill columns) to test for
# association with the label via a chi-squared test of independence.
li = ['user_os_type', 'source_attribution_type', 'quiz_answer', 'resume_flag', 'english_communication'\
, 'role_type','is_github']
li.extend(top_skill)

# One chi-squared test per feature. Rows are collected in a list and the
# DataFrame is built once (the original grew it cell-by-cell with .at, and
# took a redundant reset_index()/iloc round-trip on the contingency table).
rows = []
for feat in li:
    con_tab = pd.crosstab(final_data[feat], Y)  # feature levels x label counts
    stat, p, dof, expected = chi2_contingency(con_tab)
    rows.append({'feature': feat, 'Fstatistics': stat, 'pvalue': p})
chi2tests = pd.DataFrame(rows, columns=['feature', 'Fstatistics', 'pvalue'])

# Largest test statistic (strongest association) first.
chi2tests.sort_values(by='Fstatistics', ascending=False, inplace=True)
chi2tests['Fstatistics'] = chi2tests['Fstatistics'].astype(int)
import plotly.express as px
fig = px.bar(chi2tests, x="feature", y="Fstatistics", hover_data=['Fstatistics', 'pvalue'], text='Fstatistics', title='Fstatistics and pvalue for categorical features.')
fig.update_traces(textfont_size=12, textangle=0, textposition="inside", cliponaxis=False)
fig.show()
chi2tests.reset_index(drop=True)
| feature | Fstatistics | pvalue | |
|---|---|---|---|
| 0 | role_type | 1805 | 0.0 |
| 1 | self_skill_Node.js | 1559 | 0.0 |
| 2 | self_skill_JavaScript | 1516 | 0.0 |
| 3 | self_skill_PHP | 1401 | 0.0 |
| 4 | self_skill_Laravel | 1254 | 0.0 |
| 5 | self_skill_Vue.js | 1211 | 0.0 |
| 6 | self_skill_MySQL | 1183 | 0.0 |
| 7 | self_skill_REST/RESTful APIs | 1078 | 0.0 |
| 8 | self_skill_PHP, Laravel | 935 | 0.0 |
| 9 | self_skill_PHP/MySQL | 932 | 0.0 |
| 10 | self_skill_Express.js | 930 | 0.0 |
| 11 | self_skill_SQL | 704 | 0.0 |
| 12 | self_skill_React | 683 | 0.0 |
| 13 | is_github | 678 | 0.0 |
| 14 | self_skill_Git | 530 | 0.0 |
| 15 | source_attribution_type | 390 | 0.0 |
| 16 | english_communication | 185 | 0.0 |
| 17 | resume_flag | 134 | 0.0 |
| 18 | self_skill_Python | 103 | 0.0 |
| 19 | user_os_type | 98 | 0.0 |
| 20 | quiz_answer | 60 | 0.0 |
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
# Pairwise Pearson correlations among the numeric predictors, shown as an
# annotated heatmap to spot collinear features.
corrMatrix = final_data[['is_github','resume_flag', 'num_chars_resume', 'years_of_experience',\
'years_of_remote_experience', 'sn_avg_score', 'num_self_skills', 'num_self_beginner_skills',\
'num_self_intermediate_skills','num_self_advanced_skills','num_self_expert_skills']].corr()
plt.figure(figsize=(8, 5))
sn.heatmap(corrMatrix, annot=True)
plt.show()
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
print(f"X_train: {X_train.shape} \ny_train: {y_train.shape} \nX_test: {X_test.shape} \ny_test: {y_test.shape}")
print(f"Postive class in train data is {round(100*sum(y_train)/len(y_train),1)}% & Postive class in test data is {round(100*sum(y_test)/len(y_test),1)}%")
from lightgbm import LGBMClassifier
# LightGBM with is_unbalance=True to reweight the minority positive class.
lgb = LGBMClassifier(is_unbalance=True, objective='binary', max_depth=8, min_split_gain = 0.01)
lgb.fit(X_train, y_train)
print('Testing accuracy')
print(classification_report(y_test, lgb.predict(X_test)))
#print('Traing accuracy')
#print(classification_report(y_train, model.predict(X_train)))
# pred_contrib=True returns per-feature SHAP contributions; the last column is
# the expected value, so it is dropped before the summary plot.
lgbm_shap = lgb.predict(X_test, pred_contrib=True)
shap.summary_plot(lgbm_shap[:,:-1], X_test)
X_train: (34633, 64)
y_train: (34633,)
X_test: (14844, 64)
y_test: (14844,)
Postive class in train data is 24.1% & Postive class in test data is 23.3%
Testing accuracy
precision recall f1-score support
0 0.89 0.67 0.76 11388
1 0.40 0.72 0.51 3456
accuracy 0.68 14844
macro avg 0.64 0.69 0.64 14844
weighted avg 0.77 0.68 0.70 14844
# Cross-validated evaluation of the LightGBM model.
y_test_k, y_pred_k = kfold_report(lgb,X,Y)
# plot ROC curve from the pooled k-fold results
plot_roc_curve(y_test=y_test_k, y_pred=y_pred_k)
# plot precision-recall vs threshold from the single held-out fit
y_pred = lgb.predict_proba(X_test)
y_pred = y_pred[:,1]  # probability of the positive class
plot_precision_recall_curve(y_test, y_pred)
Average Score from cross validation with cv=10
LGBMClassifier(is_unbalance=True) : {'roc_auc': 0.764, 'f1': 0.517, 'precision': 0.41, 'recall': 0.71, 'accuracy': 0.681}
Mean Recall from Kfold : 0.712
Over all classification report
precision recall f1-score support
0 0.88 0.68 0.77 75368
1 0.41 0.71 0.52 23586
accuracy 0.68 98954
macro avg 0.64 0.69 0.64 98954
weighted avg 0.77 0.68 0.71 98954
from catboost import CatBoostClassifier, Pool
# Same 70/30 split as the LightGBM experiment (same seed => same partition).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
print(f"X_train: {X_train.shape} \ny_train: {y_train.shape} \nX_test: {X_test.shape} \ny_test: {y_test.shape}")
print(f"Postive class in train data is {round(100*sum(y_train)/len(y_train),1)}% & Postive class in test data is {round(100*sum(y_test)/len(y_test),1)}%")
## CatBoostClassifier
# class_weights gives the positive class 4x weight to offset the ~24% positive rate.
catboot = CatBoostClassifier(verbose=False, class_weights={0:1, 1:4})
catboot.fit(X_train, y_train)
print('Testing accuracy')
print(classification_report(y_test, catboot.predict(X_test)))
#print('Traing accuracy')
#print(classification_report(y_train, catboot.predict(X_train)))
# ShapValues returns per-feature contributions; the last column is the
# expected value, split off before plotting.
shap_values = catboot.get_feature_importance(Pool(X_test, label=y_test) ,type="ShapValues")
expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]
shap.summary_plot(shap_values, X_test)
X_train: (34633, 64)
y_train: (34633,)
X_test: (14844, 64)
y_test: (14844,)
Postive class in train data is 24.1% & Postive class in test data is 23.3%
Testing accuracy
precision recall f1-score support
0 0.89 0.63 0.74 11388
1 0.38 0.76 0.51 3456
accuracy 0.66 14844
macro avg 0.64 0.69 0.63 14844
weighted avg 0.78 0.66 0.69 14844
# Cross-validated evaluation of the CatBoost model.
y_test_k, y_pred_k = kfold_report(catboot,X,Y)
# plot ROC curve from the pooled k-fold results
plot_roc_curve(y_test=y_test_k, y_pred=y_pred_k)
# plot precision-recall vs threshold from the single held-out fit
y_pred = catboot.predict_proba(X_test)
y_pred = y_pred[:,1]  # probability of the positive class
plot_precision_recall_curve(y_test, y_pred)
Average Score from cross validation with cv=10
<catboost.core.CatBoostClassifier object at 0x000002E1092E3BB0> : {'roc_auc': 0.763, 'f1': 0.513, 'precision': 0.393, 'recall': 0.749, 'accuracy': 0.659}
Mean Recall from Kfold : 0.748
Over all classification report
precision recall f1-score support
0 0.89 0.64 0.74 75368
1 0.39 0.75 0.51 23586
accuracy 0.66 98954
macro avg 0.64 0.69 0.63 98954
weighted avg 0.77 0.66 0.69 98954
#explainer = shap.Explainer(model)
#shap_values = explainer(X_test)
#shap.plots.beeswarm(shap_values)
#shap.plots.bar(shap_values)
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# --- Hyper-parameter search for LightGBM ------------------------------------
# define model
lgb_model = LGBMClassifier(is_unbalance=True, objective='binary')
# define grid: tree depth and minimum split gain
param_grid = dict(max_depth=range(2,12,2), min_split_gain=[val/100 for val in list(range(0,10,2))])
# define evaluation procedure: stratified 2-fold, repeated twice
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, random_state=1)
# define grid search, optimizing F1 on the imbalanced target
grid = GridSearchCV(estimator=lgb_model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='f1')
# execute the grid search
grid_result = grid.fit(X, Y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
#    print("%f (%f) with: %r" % (mean, stdev, param))

# Refit with the best hyper-parameters and evaluate via 10-fold CV.
lgb_model = LGBMClassifier(is_unbalance=True, objective='binary', max_depth=grid_result.best_params_['max_depth'],
                           min_split_gain = grid_result.best_params_['min_split_gain'])
originalclass = []   # true labels accumulated across folds
predictedclass = []  # predicted labels accumulated across folds

# Scorer that records each fold's labels as a side effect so one pooled
# classification report can be printed afterwards.
def custom_scoring_function(y_true, y_pred):
    originalclass.extend(y_true)
    predictedclass.extend(y_pred)
    return recall_score(y_true, y_pred)  # could also use f1_score with an average parameter

nested_score = cross_val_score(lgb_model, X=X, y=Y, cv=10, scoring=make_scorer(custom_scoring_function))
# The scorer returns recall, so label it as recall (original said "Accuracy").
print("Mean Recall from Kfold : %.3f" % np.mean(nested_score))
print('\n Over all classification report')
print(classification_report(originalclass, predictedclass))
Best: 0.515166 using {'max_depth': 4, 'min_split_gain': 0.08}
Mean Accuracy from Kfold : 0.721
Over all classification report
precision recall f1-score support
0 0.88 0.66 0.76 37684
1 0.40 0.72 0.52 11793
accuracy 0.68 49477
macro avg 0.64 0.69 0.64 49477
weighted avg 0.77 0.68 0.70 49477